Analyzing and writing to a .docx file

Analyze any `.docx` file for bold, underlined, italicized text

This program helps you find all the bold, underlined and italicized text in a word document.

First, create a new folder, create a file named extract.py inside it, and paste the code below into that file. Then install python-docx:

$ pip install python-docx

Copy your Word document (for example, process_design_notes.docx) into the current working directory (CWD).

The CWD should now have two files i.e. extract.py and process_design_notes.docx.

Open a terminal or command prompt in CWD and type

#for linux
python3 extract.py process_design_notes.docx
#for windows
python extract.py process_design_notes.docx

After you run the command above, the program processes the Word document and appends the extracted bold, italicized, and underlined words to the end of it.

Source Code: extract.py

""" This module uses the 'python-docx' package to analyze a Word Document ('.docx')
    This module can be imported in other programs but the preferred way is to run it
    as a script from the command line like this:
    
    $ python extract.py <name-of-word-document-file>
    
    The basic function of the program is to analyze the word document and 
    collect bold, italic and underlined words from it and then 
    after analyzing write these collected words at the end of the word document.
    So the program first reads the word document, collects bold, italic 
    and underlined words from it and then writes the collected words at
    the very end of the same word document 
    Copyright 2023 Kashaan Mahmood
    License: MIT License
             https://opensource.org/license/mit/
    """


from docx import Document
from docx.api import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH

# global variables
# total_words: word-count placeholder, initialised to 0; nothing in the
# visible code assigns a new value to it.
total_words = 0
# wordsList: text that write_data() stores the formatted word listing in.
wordsList = ""


# calculate total words in docx
def get_total_words(docxFile):
    """Return the number of whitespace-separated words in the document.

    Args:
        docxFile: path (or file-like object) accepted by ``Document()``.

    Returns:
        int: total word count over ``document.paragraphs``.
    """
    document = Document(docxFile)
    # Count on the paragraph text instead of run by run: a single word is
    # often stored as several consecutive runs, and splitting each run
    # separately would count every fragment as its own word.
    return sum(len(paragraph.text.split()) for paragraph in document.paragraphs)


# Characters (plus one multi-character sequence) stripped from extracted text.
# Order matters: "— — " has to be tried before the single "—" so that the
# spaces inside the sequence are removed together with the dashes.
# Both the opening and the closing form of each curly quote are listed so a
# quoted phrase does not keep one dangling quote mark.
unwanted_characters = [
    '"',
    "'",
    "‘",
    "’",
    "“",
    "”",
    ":",
    "\n",
    "-",
    "— — ",
    "—",
    ".",
    ",",
    ";",
    "!",
    "?",
]


def remove_unwanted(words):
    """Return `words` with every entry of `unwanted_characters` removed.

    Args:
        words: the text to clean (a single string, despite the name).

    Returns:
        str: the text with all listed characters/sequences deleted; the
        remaining characters, including spaces, are left untouched.
    """
    # str.replace is a no-op when the substring is absent, so no membership
    # test is needed before each replacement.
    for unwanted in unwanted_characters:
        words = words.replace(unwanted, "")
    return words


def analyze(docxFile):
    """Collect the text of bold, italic and underlined runs of a docx file.

    Each run in ``document.paragraphs`` is classified by the combination of
    bold / italic / underline flags that are set on it. The run's text is
    cleaned with remove_unwanted() and stored under the matching key.
    Runs with none of the three flags set are ignored.

    Returns:
        list: seven single-key dicts, in the order
        "b", "i", "u", "bi", "bu", "iu", "biu", each mapping the key to the
        list of cleaned run texts found for that combination.
    """
    order = ("b", "i", "u", "bi", "bu", "iu", "biu")
    buckets = {key: [] for key in order}

    for paragraph in Document(docxFile).paragraphs:
        for run in paragraph.runs:
            # Build the bucket key from the flags that are set, always in
            # the order bold, italic, underline (e.g. "b", "bi", "iu").
            key = "".join(
                letter
                for letter, enabled in (
                    ("b", run.bold),
                    ("i", run.italic),
                    ("u", run.underline),
                )
                if enabled
            )
            if not key:
                continue  # run carries none of the three formats
            buckets[key].append(remove_unwanted(run.text))

    return [{key: buckets[key]} for key in order]


def write_data(docxFile, data):
    """Append the words collected by analyze() to the end of the docx file.

    Args:
        docxFile: path of the document; it is opened, extended and saved
            back to the same path.
        data: list of single-key dicts as returned by analyze(), e.g.
            ``[{"b": [...]}, {"i": [...]}, ...]``. Categories are written in
            the order they appear in this list.

    The appended section is a centred title paragraph, one paragraph holding
    the word listing, and a centred closing rule. Categories whose word list
    is empty are left out.
    """
    global wordsList

    # Heading emitted before the words of each category. Every heading except
    # the bold one starts with an extra newline to separate it from the
    # preceding category.
    categories = {
        "b": "\nBold Words:-",
        "i": "\n\nItalicized Words:-",
        "u": "\n\nUnderlined Words:-",
        "bi": "\n\nBold & Italicized Words:-",
        "bu": "\n\nBold & Underlined Words:-",
        "biu": "\n\nBold & Italicized & Underlined Words:-",
        "iu": "\n\nItalicized & Underlined Words:-",
    }

    document = Document(docxFile)

    sections = []
    for entry in data:
        for key, words in entry.items():
            # Every known category is handled here, including "i", so
            # italic-only words are written too. Unknown keys and empty
            # categories produce no output.
            if key in categories and words:
                listing = ", ".join(str(word) for word in words)
                sections.append(categories[key] + "\n" + listing)

    # The listing is rebuilt from scratch on every call so words from an
    # earlier call never leak into this document. The module-level name is
    # still updated for code that reads it afterwards.
    wordsList = "".join(sections)

    title_p = document.add_paragraph("\n========== Extracted Words ==========\n")
    title_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    document.add_paragraph().add_run(wordsList)

    ending_p = document.add_paragraph("\n===================\n")
    ending_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Overwrites the input file with the extended document.
    document.save(docxFile)


# function calls inside main()


_USAGE = "usage: python extract.py <name-of-word-document-file>"


def main(docxFile=None):
    """Analyze a Word document and append the extracted words to it.

    Args:
        docxFile: path of the '.docx' file. When omitted, the first
            command-line argument is used, so ``main()`` keeps working for
            callers that pass nothing.

    Raises:
        SystemExit: if no path is given and none is present on the
            command line.
    """
    if docxFile is None:
        from sys import argv

        if len(argv) < 2:
            raise SystemExit(_USAGE)
        docxFile = argv[1]

    data = analyze(docxFile)
    write_data(docxFile, data)


if __name__ == "__main__":
    from sys import argv
    import time

    # Fail with a usage message instead of an IndexError when the document
    # name is missing.
    if len(argv) < 2:
        raise SystemExit(_USAGE)

    # get docx file
    docx = argv[1]

    print(f"Started at {time.strftime('%X')}...")
    # calling main()
    main(docx)
    print(f"Finished at {time.strftime('%X')}...")

Analyze any .docx file for bold, underlined, italicized text

Source Code: extract.py

Analyze any `.docx` file for bold, underlined, italicized text